Convential wisdom is that money buys happiness (winning) in Major League Baseball. However, the advent of “Moneyball” in the early 2000s by the Oakland Athletics, Cleveland Indians and other teams has lead to a more analytical approach to detemining the make-up of Major League rosters.
As it turns out, money is not the magic elixer when it comes to assembling a winning Major League Baseball (MLB) team. The following plot shows that salary does not highly correlate with a winning record. This is substantiated by the companion single linear regression model and summary statistics that show that salary, while significant, only has a marginal impact on wins by an MLB team. The adjusted \(R^2\) from thesimple linear regression – using training data from 2000 through 2013 is low. Furthermore, the average percent error that compares actual wins versus predicted wins from the the test data (2014 through 2016) is high.
So what are the factors that move the needle?
We explore two related threads in attempting to identify the factors that have a strong impact on a winning baseball team.
The first thread uses multilear regression to identify the factors that influence a teams’s winning record over the course of a season. The second thread uses logistic regression to identify the factors that influence a team’s ability to win their division.
A few words are in order about where we obtained the data from to perform this analysis.
The source of data is the Sean Latham baseball archive (http://www.seanlahman.com/baseball-archive/), recognized by the Society for American Baseball Research (SABR) as the leading archive detailed player and team data from 1874 through the end of the 2017 Major League Baseball season. We use team statistics from the years 2000 through 2013 as training dataset and use data from 2014 through 2016 as the test dataset. (We joined multiple datasets together from this archive and one the datasets only has data throught the end of the 2016 season. Because of this, we were not able to include the 2017 season as part of this study.)
##
## Call:
## lm(formula = W ~ salary, data = bbproj_trn)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34.389 -8.180 0.701 8.404 35.659
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 71.70962155199 1.29065984697 55.560 < 2e-16 ***
## salary 0.00000011551 0.00000001468 7.866 0.0000000000000316 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.86 on 418 degrees of freedom
## Multiple R-squared: 0.1289, Adjusted R-squared: 0.1269
## F-statistic: 61.87 on 1 and 418 DF, p-value: 0.00000000000003158
## [1] "Leave One Out Cross-Validated RMSE: 10.88"
library(faraway)
scope <- W ~ R + AB + H + X2B + X3B + HR + BB + SO + SB + CS + HBP + SF + RA + ER + ERA + CG + SHO + SV + IPouts + HA + HRA + BBA + SOA + E + DP + FP + attendance + BPF + PPF + salary + RBI + GIDP + IBB + TB + SLG + OBP + OPS + WHIP + BABIP + RC + X1B + uBB + wOBA
formula <- formula(scope)
start_model <- lm(formula, data= bbproj_trn)
n <- length(resid(start_model))
step_search_start_model <- lm(W ~ 1, data= bbproj_trn)
### Backwards Search: BIC Model
bic_model <- step(start_model, direction = "backward", k = log(n),trace = 0)
summary(bic_model)
##
## Call:
## lm(formula = W ~ R + AB + H + X2B + BB + SO + CS + SF + RA +
## CG + SHO + SV + IPouts + HRA + BBA + E + FP + GIDP + BABIP +
## RC, data = bbproj_trn)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.8573 -1.9757 -0.1555 1.9118 7.4927
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1297.971349 339.322389 3.825 0.000152 ***
## R 0.076579 0.006140 12.472 < 2e-16 ***
## AB -0.116786 0.014360 -8.133 5.37e-15 ***
## H 0.306262 0.049740 6.157 1.81e-09 ***
## X2B 0.034285 0.010085 3.399 0.000743 ***
## BB 0.031072 0.006669 4.659 4.33e-06 ***
## SO 0.051706 0.009680 5.341 1.55e-07 ***
## CS -0.043025 0.015571 -2.763 0.005990 **
## SF -0.095773 0.021708 -4.412 1.32e-05 ***
## RA -0.054254 0.003811 -14.238 < 2e-16 ***
## CG 0.148895 0.046962 3.171 0.001639 **
## SHO 0.150078 0.049080 3.058 0.002380 **
## SV 0.345616 0.025203 13.714 < 2e-16 ***
## IPouts 0.057453 0.007126 8.063 8.78e-15 ***
## HRA -0.026131 0.008530 -3.063 0.002338 **
## BBA -0.008042 0.002619 -3.070 0.002287 **
## E -0.182853 0.055690 -3.283 0.001116 **
## FP -1045.006972 343.178691 -3.045 0.002480 **
## GIDP -0.044072 0.012147 -3.628 0.000322 ***
## BABIP -760.159504 137.620611 -5.524 6.00e-08 ***
## RC -0.115957 0.024057 -4.820 2.04e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.729 on 399 degrees of freedom
## Multiple R-squared: 0.9475, Adjusted R-squared: 0.9449
## F-statistic: 360.1 on 20 and 399 DF, p-value: < 2.2e-16
### Backwards Search: AIC Model
aic_model <- step(start_model, direction = "backward", trace = 0)
summary(aic_model)
##
## Call:
## lm(formula = W ~ R + AB + H + X2B + BB + SO + CS + SF + RA +
## CG + SHO + SV + IPouts + HRA + BBA + E + FP + BPF + PPF +
## salary + GIDP + BABIP + RC, data = bbproj_trn)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9901 -1.9391 -0.1523 1.9737 7.8582
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1224.786161854707 345.895721393242 3.541 0.000446 ***
## R 0.080154881164 0.006191576342 12.946 < 2e-16 ***
## AB -0.115370180878 0.014302511665 -8.066 8.71e-15 ***
## H 0.294198894471 0.049836065902 5.903 7.66e-09 ***
## X2B 0.034434010453 0.010055404520 3.424 0.000680 ***
## BB 0.029747234634 0.006800119630 4.375 1.56e-05 ***
## SO 0.048815441474 0.009683749780 5.041 7.06e-07 ***
## CS -0.044627799778 0.015532577786 -2.873 0.004283 **
## SF -0.096303670130 0.021621972055 -4.454 1.10e-05 ***
## RA -0.057347471123 0.004031932044 -14.223 < 2e-16 ***
## CG 0.154701717514 0.046819638380 3.304 0.001039 **
## SHO 0.143225421132 0.048746440530 2.938 0.003495 **
## SV 0.346070608015 0.025076294754 13.801 < 2e-16 ***
## IPouts 0.058770788132 0.007084174725 8.296 1.71e-15 ***
## HRA -0.022515047089 0.008581255343 -2.624 0.009033 **
## BBA -0.008098302987 0.002628330947 -3.081 0.002206 **
## E -0.172337651761 0.056648920630 -3.042 0.002505 **
## FP -977.032306332758 348.883980530369 -2.800 0.005353 **
## BPF -0.639264997534 0.233137357795 -2.742 0.006383 **
## PPF 0.599162430181 0.234965627610 2.550 0.011148 *
## salary 0.000000008060 0.000000004559 1.768 0.077838 .
## GIDP -0.047142718419 0.012117297039 -3.891 0.000117 ***
## BABIP -723.253375876375 138.139293383544 -5.236 2.67e-07 ***
## RC -0.109319751926 0.024176237333 -4.522 8.11e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.704 on 396 degrees of freedom
## Multiple R-squared: 0.9489, Adjusted R-squared: 0.9459
## F-statistic: 319.4 on 23 and 396 DF, p-value: < 2.2e-16
### Step Search: BIC Model
step_bic_model <- step(step_search_start_model, scope = scope, direction = "both",k = log(n),trace = 0)
summary(step_bic_model)
##
## Call:
## lm(formula = W ~ SV + RA + CG + OBP + X3B + SHO + R, data = bbproj_trn)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.7419 -1.9652 0.0052 1.8342 9.5000
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.123870 6.025094 6.493 0.000000000242 ***
## SV 0.427352 0.025372 16.844 < 2e-16 ***
## RA -0.076829 0.002616 -29.364 < 2e-16 ***
## CG 0.152982 0.046478 3.292 0.001082 **
## OBP 65.425764 23.940331 2.733 0.006549 **
## X3B -0.063520 0.016876 -3.764 0.000192 ***
## SHO 0.174222 0.053140 3.279 0.001132 **
## R 0.079990 0.004115 19.439 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.01 on 412 degrees of freedom
## Multiple R-squared: 0.9341, Adjusted R-squared: 0.9329
## F-statistic: 833.7 on 7 and 412 DF, p-value: < 2.2e-16
### Step Search: AIC Model ###
step_aic_model <- step(step_search_start_model, scope = scope, direction = "both", trace = 0)
summary(step_aic_model)
##
## Call:
## lm(formula = W ~ SV + RA + CG + X3B + SHO + R + IPouts + AB +
## H + CS + GIDP + SF + BBA + HRA + E + FP + BPF + PPF + salary +
## HA, data = bbproj_trn)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.5195 -1.9115 -0.1021 1.8888 8.0195
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 914.034003144572 347.045241981607 2.634 0.008773
## SV 0.355614506804 0.024640390657 14.432 < 2e-16
## RA -0.050247955833 0.006340159551 -7.925 0.000000000000023
## CG 0.153017526099 0.044231471311 3.459 0.000600
## X3B -0.068574679286 0.016831959357 -4.074 0.000055748930342
## SHO 0.138199572874 0.049132489791 2.813 0.005154
## R 0.084827624066 0.003584776861 23.663 < 2e-16
## IPouts 0.061359854959 0.006880713970 8.918 < 2e-16
## AB -0.052857166713 0.005932828505 -8.909 < 2e-16
## H 0.049033080922 0.006741047553 7.274 0.000000000001865
## CS -0.046638144061 0.015078914455 -3.093 0.002121
## GIDP -0.043755357335 0.011669100346 -3.750 0.000203
## SF -0.052733758025 0.019580831875 -2.693 0.007377
## BBA -0.010442337072 0.003185502948 -3.278 0.001137
## HRA -0.021622522454 0.008707631554 -2.483 0.013432
## E -0.161327447529 0.057050198834 -2.828 0.004923
## FP -884.643492393877 352.491658585801 -2.510 0.012479
## BPF -0.629501587254 0.230983379839 -2.725 0.006707
## PPF 0.584792736874 0.231600448463 2.525 0.011956
## salary 0.000000007730 0.000000004537 1.704 0.089183
## HA -0.005934333997 0.004089404824 -1.451 0.147524
##
## (Intercept) **
## SV ***
## RA ***
## CG ***
## X3B ***
## SHO **
## R ***
## IPouts ***
## AB ***
## H ***
## CS **
## GIDP ***
## SF **
## BBA **
## HRA *
## E **
## FP *
## BPF **
## PPF *
## salary .
## HA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.727 on 399 degrees of freedom
## Multiple R-squared: 0.9476, Adjusted R-squared: 0.945
## F-statistic: 360.7 on 20 and 399 DF, p-value: < 2.2e-16
** Richard: Include model and summary(model) here **
** Richard: Include model and summary(model) here **
### Breusch-Pagan Test on Backwards Search - BIC Model
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
bptest(bic_model)
##
## studentized Breusch-Pagan test
##
## data: bic_model
## BP = 19.912, df = 20, p-value = 0.4635
plot_fitted_versus_residuals(fitted(bic_model), resid(bic_model), "Backwards Search - BIC Model")
### Shapiro - Wilk Normality Test on Backwards Search - BIC Model ###
shapiro.test(resid(bic_model))
##
## Shapiro-Wilk normality test
##
## data: resid(bic_model)
## W = 0.99328, p-value = 0.05825
qqnorm(resid(bic_model), main = "Normal Q-Q Plot, Backwards Search - BIC Model", col = "darkgrey")
qqline(resid(bic_model), col = "dodgerblue", lwd = 2)
print(paste("Leave One Out Cross-Validated RMSE: ", round(calc_loocv_rmse(bic_model),2)))
## [1] "Leave One Out Cross-Validated RMSE: 2.81"
### Do the number of standard residuals greater than 2 exceed 5% of the total observations -- Backwards Search: BIC Model
std_resid_bic_model <- rstandard(bic_model)[abs(rstandard(bic_model)) > 2]
is_std_resid_gt_five_percent_bic_model <- length(std_resid_bic_model) / n > 0.05
ifelse(is_std_resid_gt_five_percent_bic_model,"Outliers Exceed 5% of Obs", "Outliers Do Not Exceed 5% of Obs")
## [1] "Outliers Do Not Exceed 5% of Obs"
#### Backwards Search - BIC Model: Unusual Observtions -- Cooks Distance > 4 / n
cd_bic_model <- cooks.distance(bic_model) > 4 / n
print(paste("Number of Influential Observations: ", sum(cd_bic_model)))
## [1] "Number of Influential Observations: 26"
bbproj_trn[which(cd_bic_model),]
## # A tibble: 26 x 51
## yearID franchID W DivWin WCWin LgWin WSWin R AB H X2B
## <int> <fct> <int> <fct> <fct> <fct> <fct> <int> <int> <int> <int>
## 1 2000 CHC 65 N N N N 764 5577 1426 272
## 2 2000 HOU 72 N N N N 938 5570 1547 289
## 3 2000 MIL 73 N N N N 740 5563 1366 297
## 4 2000 TOR 83 N N N N 861 5677 1562 328
## 5 2001 TBD 62 N N N N 672 5524 1426 311
## 6 2002 BOS 93 N N N N 859 5640 1560 348
## 7 2002 MIL 56 N N N N 627 5415 1369 269
## 8 2002 MIN 94 Y N N N 768 5582 1518 348
## 9 2002 OAK 103 Y N N N 800 5558 1450 279
## 10 2003 FLA 91 N Y Y Y 751 5490 1459 292
## # ... with 16 more rows, and 40 more variables: X3B <int>, HR <int>,
## # BB <int>, SO <int>, SB <int>, CS <int>, HBP <int>, SF <int>, RA <int>,
## # ER <int>, ERA <dbl>, CG <int>, SHO <int>, SV <int>, IPouts <int>,
## # HA <int>, HRA <int>, BBA <int>, SOA <int>, E <int>, DP <int>,
## # FP <dbl>, park <fct>, attendance <int>, BPF <int>, PPF <int>,
## # salary <int>, RBI <int>, GIDP <int>, IBB <int>, TB <int>, SLG <dbl>,
## # OBP <dbl>, OPS <dbl>, WHIP <dbl>, BABIP <dbl>, RC <dbl>, X1B <int>,
## # uBB <int>, wOBA <dbl>
### VIF > 5 for Backwards Search: BIC Model Coefficients
library(faraway)
vif_bic_model <- vif(bic_model)
vif_bic_model[which(vif_bic_model > 5)]
## R AB H BB SO RA
## 14.894205 72.403111 929.925966 12.247576 80.989904 6.226679
## E FP BABIP RC
## 48.143435 48.871767 122.560946 226.450179
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:faraway':
##
## melanoma
## Loading required package: ggplot2
bic_model_high_vif_cols <- c("R", "AB", "H", "BB", "SO", "RA", "E", "FP", "BABIP", "RC")
indices_to_drop <- findCorrelation(cor(bbproj_trn[,c(bic_model_high_vif_cols)]), cutoff = 0.6)
vars_to_drop <- bic_model_high_vif_cols[indices_to_drop]
vars_to_drop
## [1] "H" "RC" "R" "FP"
smaller_bic_model <- lm(W ~ AB + BB + SO + SF + RA + CG + SV + IPouts + BBA + BABIP, data = bbproj_trn)
summary(smaller_bic_model)
##
## Call:
## lm(formula = W ~ AB + BB + SO + SF + RA + CG + SV + IPouts +
## BBA + BABIP, data = bbproj_trn)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.0090 -3.1465 0.0746 3.1017 13.2658
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -61.891791 33.514874 -1.847 0.065513 .
## AB 0.035857 0.005639 6.358 0.000000000546 ***
## BB 0.055611 0.003754 14.812 < 2e-16 ***
## SO -0.008264 0.002508 -3.296 0.001067 **
## SF 0.083700 0.033456 2.502 0.012746 *
## RA -0.070294 0.004609 -15.250 < 2e-16 ***
## CG 0.335688 0.080389 4.176 0.000036315586 ***
## SV 0.599497 0.041742 14.362 < 2e-16 ***
## IPouts -0.019466 0.010220 -1.905 0.057516 .
## BBA -0.008600 0.004530 -1.898 0.058341 .
## BABIP 118.264318 33.616837 3.518 0.000484 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.888 on 409 degrees of freedom
## Multiple R-squared: 0.8274, Adjusted R-squared: 0.8231
## F-statistic: 196 on 10 and 409 DF, p-value: < 2.2e-16
library(lmtest)
library(faraway)
vif(smaller_bic_model)
## AB BB SO SF RA CG SV IPouts
## 3.480184 1.210000 1.693955 1.472823 2.839735 1.243693 1.573034 2.843318
## BBA BABIP
## 1.558630 2.279289
print(paste("Number of coefficients with VIF > 5 : ", sum(vif(smaller_bic_model) > 5)))
## [1] "Number of coefficients with VIF > 5 : 0"
bptest(smaller_bic_model)
##
## studentized Breusch-Pagan test
##
## data: smaller_bic_model
## BP = 13.488, df = 10, p-value = 0.1976
shapiro.test(resid(smaller_bic_model))
##
## Shapiro-Wilk normality test
##
## data: resid(smaller_bic_model)
## W = 0.99706, p-value = 0.6567
print(paste("Leave One Out Cross-Validated RMSE: ", round(calc_loocv_rmse(smaller_bic_model),2)))
## [1] "Leave One Out Cross-Validated RMSE: 4.95"
std_resid_bic_model <- rstandard(bic_model)[abs(rstandard(smaller_bic_model)) > 2]
is_std_resid_gt_five_percent_bic_model <- length(std_resid_bic_model) / n > 0.05
ifelse(is_std_resid_gt_five_percent_bic_model,"Outliers Exceed 5% of Obs", "Outliers Do Not Exceed 5% of Obs")
## [1] "Outliers Exceed 5% of Obs"
cd_smaller_bic_model <- cooks.distance(smaller_bic_model) > 4 / n
print(paste("Number of Influential Observations: ", sum(cd_smaller_bic_model)))
## [1] "Number of Influential Observations: 25"
### Breusch-Pagan Test on AIC Model
library(lmtest)
bptest(aic_model)
##
## studentized Breusch-Pagan test
##
## data: aic_model
## BP = 22.713, df = 23, p-value = 0.4777
plot_fitted_versus_residuals(fitted(aic_model), resid(aic_model), "Backwards Search - AIC Model")
### Shapiro - Wilk Normality Test on Backwards Search - AIC Model ###
shapiro.test(resid(aic_model))
##
## Shapiro-Wilk normality test
##
## data: resid(aic_model)
## W = 0.99355, p-value = 0.07005
qqnorm(resid(aic_model), main = "Normal Q-Q Plot, Backwards Search - AIC Model", col = "darkgrey")
qqline(resid(aic_model), col = "dodgerblue", lwd = 2)
## [1] "Leave One Out Cross-Validated RMSE: 2.79"
### Do the number of standard residuals greater than 2 exceed 5% of the total observations -- Backwards Search: AIC Model
std_resid_aic_model <- rstandard(aic_model)[abs(rstandard(aic_model)) > 2]
is_std_resid_gt_five_percent_aic_model <- length(std_resid_aic_model) / n > 0.05
ifelse(is_std_resid_gt_five_percent_aic_model,"Outliers Exceed 5% of Obs", "Outliers Do Not Exceed 5% of Obs")
## [1] "Outliers Do Not Exceed 5% of Obs"
#### Backwards Search - AIC Model: Unusual Observtions -- Cooks Distance > 4 / n
cd_aic_model <- cooks.distance(aic_model) > 4 / n
print(paste("Number of Influential Observations: ", sum(cd_aic_model)))
## [1] "Number of Influential Observations: 22"
bbproj_trn[which(cd_aic_model),]
## # A tibble: 22 x 51
## yearID franchID W DivWin WCWin LgWin WSWin R AB H X2B
## <int> <fct> <int> <fct> <fct> <fct> <fct> <int> <int> <int> <int>
## 1 2000 CHC 65 N N N N 764 5577 1426 272
## 2 2000 HOU 72 N N N N 938 5570 1547 289
## 3 2000 KCR 77 N N N N 879 5709 1644 281
## 4 2000 MIL 73 N N N N 740 5563 1366 297
## 5 2000 TOR 83 N N N N 861 5677 1562 328
## 6 2002 BOS 93 N N N N 859 5640 1560 348
## 7 2002 MIL 56 N N N N 627 5415 1369 269
## 8 2002 MIN 94 Y N N N 768 5582 1518 348
## 9 2002 OAK 103 Y N N N 800 5558 1450 279
## 10 2003 FLA 91 N Y Y Y 751 5490 1459 292
## # ... with 12 more rows, and 40 more variables: X3B <int>, HR <int>,
## # BB <int>, SO <int>, SB <int>, CS <int>, HBP <int>, SF <int>, RA <int>,
## # ER <int>, ERA <dbl>, CG <int>, SHO <int>, SV <int>, IPouts <int>,
## # HA <int>, HRA <int>, BBA <int>, SOA <int>, E <int>, DP <int>,
## # FP <dbl>, park <fct>, attendance <int>, BPF <int>, PPF <int>,
## # salary <int>, RBI <int>, GIDP <int>, IBB <int>, TB <int>, SLG <dbl>,
## # OBP <dbl>, OPS <dbl>, WHIP <dbl>, BABIP <dbl>, RC <dbl>, X1B <int>,
## # uBB <int>, wOBA <dbl>
### VIF > 5 for Backwards Search: AIC Model Coefficients
library(faraway)
vif_aic_model <- vif(aic_model)
vif_aic_model[which(vif_aic_model > 5)]
## R AB H BB SO RA
## 15.427663 73.154978 950.861657 12.972390 82.555959 7.100542
## E FP BPF PPF BABIP RC
## 50.740746 51.448558 85.281044 84.114946 125.780508 232.943354
library(caret)
aic_model_high_vif_cols <- c("R", "AB", "H", "X2B", "X3B", "HR", "BB", "SO", "SF", "RA", "ER", "IPouts", "E", "FP","BPF", "PPF", "IBB", "OBP", "BABIP", "RC", "wOBA")
indices_to_drop <- findCorrelation(cor(bbproj_trn[,c(aic_model_high_vif_cols)]), cutoff = 0.6)
vars_to_drop <- aic_model_high_vif_cols[indices_to_drop]
vars_to_drop
## [1] "RC" "wOBA" "R" "OBP" "H" "RA" "BPF" "FP"
#smaller_aic_model <- lm(formula = W ~ AB + HR + BB + SO + CS + ER + CG + SHO + SV + IPouts + BBA + FP + GIDP + IBB + BABIP, data = bbproj)
smaller_aic_model <- lm(formula = W ~ HR + BB + SO + ER + CG + SHO + SV + IPouts + BBA + FP + GIDP + BABIP, data = bbproj_trn)
summary(smaller_aic_model)
##
## Call:
## lm(formula = W ~ HR + BB + SO + ER + CG + SHO + SV + IPouts +
## BBA + FP + GIDP + BABIP, data = bbproj_trn)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.3371 -2.5236 -0.0841 2.1182 10.2418
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -328.428863 67.755211 -4.847 0.00000178 ***
## HR 0.132223 0.005998 22.044 < 2e-16 ***
## BB 0.032238 0.002798 11.523 < 2e-16 ***
## SO -0.023360 0.001674 -13.952 < 2e-16 ***
## ER -0.068993 0.003633 -18.993 < 2e-16 ***
## CG 0.189266 0.057990 3.264 0.001192 **
## SHO 0.235242 0.061006 3.856 0.000134 ***
## SV 0.410129 0.030171 13.593 < 2e-16 ***
## IPouts 0.014815 0.005185 2.857 0.004495 **
## BBA -0.008063 0.003203 -2.517 0.012216 *
## FP 278.765319 69.652956 4.002 0.00007457 ***
## GIDP -0.037475 0.012793 -2.929 0.003589 **
## BABIP 316.088714 16.896493 18.707 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.435 on 407 degrees of freedom
## Multiple R-squared: 0.9152, Adjusted R-squared: 0.9127
## F-statistic: 365.9 on 12 and 407 DF, p-value: < 2.2e-16
library(lmtest)
library(faraway)
vif(smaller_aic_model)
## HR BB SO ER CG SHO SV IPouts
## 1.426241 1.360620 1.529067 3.064298 1.310541 1.907533 1.664221 1.482259
## BBA FP GIDP BABIP
## 1.578259 1.270639 1.409269 1.166015
print(paste("Number of coefficients with VIF > 5 : ", sum(vif(smaller_aic_model) > 5)))
## [1] "Number of coefficients with VIF > 5 : 0"
bptest(smaller_aic_model)
##
## studentized Breusch-Pagan test
##
## data: smaller_aic_model
## BP = 8.7708, df = 12, p-value = 0.7224
shapiro.test(resid(smaller_aic_model))
##
## Shapiro-Wilk normality test
##
## data: resid(smaller_aic_model)
## W = 0.99241, p-value = 0.03137
print(paste("Leave One Out Cross-Validated RMSE: ", round(calc_loocv_rmse(smaller_aic_model),2)))
## [1] "Leave One Out Cross-Validated RMSE: 3.49"
std_resid_aic_model <- rstandard(smaller_aic_model)[abs(rstandard(smaller_aic_model)) > 2]
is_std_resid_gt_five_percent_aic_model <- length(std_resid_aic_model) / n > 0.05
ifelse(is_std_resid_gt_five_percent_aic_model,"Outliers Exceed 5% of Obs", "Outliers Do Not Exceed 5% of Obs")
## [1] "Outliers Exceed 5% of Obs"
cd_smaller_aic_model <- cooks.distance(smaller_aic_model) > 4 / n
print(paste("Number of Influential Observations: ", sum(cd_smaller_bic_model)))
## [1] "Number of Influential Observations: 25"
### Breusch-Pagan Test on Step BIC Model
bptest(step_bic_model)
##
## studentized Breusch-Pagan test
##
## data: step_bic_model
## BP = 3.0387, df = 7, p-value = 0.8814
plot_fitted_versus_residuals(fitted(step_bic_model), resid(step_bic_model), "Step Search - BIC Model")
### Shapiro - Wilk Normality Test on Step Search - BIC Model ###
shapiro.test(resid(step_bic_model))
##
## Shapiro-Wilk normality test
##
## data: resid(step_bic_model)
## W = 0.99708, p-value = 0.6604
qqnorm(resid(step_bic_model), main = "Normal Q-Q Plot, Step Search - BIC Model", col = "darkgrey")
qqline(resid(step_bic_model), col = "dodgerblue", lwd = 2)
## [1] "Leave One Out Cross-Validated RMSE: 3.04"
### Do the number of standard residuals greater than 2 exceed 5% of the total observations -- Step Search: BIC Model
std_resid_step_bic_model <- rstandard(step_bic_model)[abs(rstandard(step_bic_model)) > 2]
is_std_resid_gt_five_percent__step_bic_model <- length(std_resid_step_bic_model) / n > 0.05
ifelse(is_std_resid_gt_five_percent__step_bic_model,"Exceeds 5% of Obs", "Does Not Exceed 5% of Obs")
## [1] "Does Not Exceed 5% of Obs"
#### Step Search - BIC Model: Unusual Observtions -- Cooks Distance > 4 / n
cd_step_bic_model <- cooks.distance(step_bic_model) > 4 / n
print(paste("Number of Influential Observations: ", sum(cd_step_bic_model)))
## [1] "Number of Influential Observations: 23"
bbproj_trn[which(cd_step_bic_model),]
## # A tibble: 23 x 51
## yearID franchID W DivWin WCWin LgWin WSWin R AB H X2B
## <int> <fct> <int> <fct> <fct> <fct> <fct> <int> <int> <int> <int>
## 1 2000 HOU 72 N N N N 938 5570 1547 289
## 2 2000 TOR 83 N N N N 861 5677 1562 328
## 3 2002 BOS 93 N N N N 859 5640 1560 348
## 4 2003 FLA 91 N Y Y Y 751 5490 1459 292
## 5 2003 HOU 87 N N N N 805 5583 1466 308
## 6 2003 SFG 100 Y N N N 755 5456 1440 281
## 7 2004 NYY 101 Y N N N 897 5527 1483 281
## 8 2004 OAK 91 N N N N 793 5728 1545 336
## 9 2005 ARI 77 N N N N 696 5550 1419 291
## 10 2006 CLE 78 N N N N 870 5619 1576 351
## # ... with 13 more rows, and 40 more variables: X3B <int>, HR <int>,
## # BB <int>, SO <int>, SB <int>, CS <int>, HBP <int>, SF <int>, RA <int>,
## # ER <int>, ERA <dbl>, CG <int>, SHO <int>, SV <int>, IPouts <int>,
## # HA <int>, HRA <int>, BBA <int>, SOA <int>, E <int>, DP <int>,
## # FP <dbl>, park <fct>, attendance <int>, BPF <int>, PPF <int>,
## # salary <int>, RBI <int>, GIDP <int>, IBB <int>, TB <int>, SLG <dbl>,
## # OBP <dbl>, OPS <dbl>, WHIP <dbl>, BABIP <dbl>, RC <dbl>, X1B <int>,
## # uBB <int>, wOBA <dbl>
### VIF > 5 for Step Search: BIC Model Coefficients
library(faraway)
vif_step_bic_model <- vif(step_bic_model)
vif_step_bic_model[which(vif_step_bic_model > 5)]
## OBP R
## 5.221787 5.499516
smaller_step_bic_model <- lm(formula = W ~ SV + R + RA + SHO + CG + X3B + IPouts + AB + salary, data = bbproj_trn)
summary(smaller_step_bic_model)
##
## Call:
## lm(formula = W ~ SV + R + RA + SHO + CG + X3B + IPouts + AB +
## salary, data = bbproj_trn)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.6930 -2.0156 0.0027 1.8031 8.6432
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.64791877157 19.43171439428 0.857 0.39209
## SV 0.39794921940 0.02548723971 15.614 < 2e-16 ***
## R 0.09465788997 0.00239639476 39.500 < 2e-16 ***
## RA -0.07000030397 0.00295754734 -23.668 < 2e-16 ***
## SHO 0.17217667234 0.05222372960 3.297 0.00106 **
## CG 0.14266098090 0.04629864831 3.081 0.00220 **
## X3B -0.04901240782 0.01691431490 -2.898 0.00396 **
## IPouts 0.02307347246 0.00528945937 4.362 0.0000163 ***
## AB -0.01298730710 0.00293489287 -4.425 0.0000124 ***
## salary 0.00000001035 0.00000000449 2.305 0.02167 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.95 on 410 degrees of freedom
## Multiple R-squared: 0.937, Adjusted R-squared: 0.9356
## F-statistic: 677.2 on 9 and 410 DF, p-value: < 2.2e-16
library(lmtest)
library(faraway)
vif(smaller_step_bic_model)
## SV R RA SHO CG X3B IPouts AB
## 1.610244 1.941649 3.209850 1.895339 1.132668 1.067672 2.091258 2.587975
## salary
## 1.267406
print(paste("Number of coefficients with VIF > 5 : ", sum(vif(smaller_step_bic_model) > 5)))
## [1] "Number of coefficients with VIF > 5 : 0"
bptest(smaller_step_bic_model)
##
## studentized Breusch-Pagan test
##
## data: smaller_step_bic_model
## BP = 4.372, df = 9, p-value = 0.8853
shapiro.test(resid(smaller_step_bic_model))
##
## Shapiro-Wilk normality test
##
## data: resid(smaller_step_bic_model)
## W = 0.9967, p-value = 0.549
print(paste("Leave One Out Cross-Validated RMSE: ", round(calc_loocv_rmse(smaller_step_bic_model),2)))
## [1] "Leave One Out Cross-Validated RMSE: 2.99"
std_resid_step_bic_model <- rstandard(smaller_step_bic_model)[abs(rstandard(smaller_step_bic_model)) > 2]
is_std_resid_gt_five_percent_step_bic_model <- length(std_resid_step_bic_model) / n > 0.05
ifelse(is_std_resid_gt_five_percent_step_bic_model,"Outliers Exceed 5% of Obs", "Outliers Do Not Exceed 5% of Obs")
## [1] "Outliers Do Not Exceed 5% of Obs"
cd_smaller_step_bic_model <- cooks.distance(smaller_step_bic_model) > 4 / n
print(paste("Number of Influential Observations: ", sum(cd_smaller_step_bic_model)))
## [1] "Number of Influential Observations: 25"
### Breusch-Pagan Test on Step AIC Model
bptest(step_aic_model)
##
## studentized Breusch-Pagan test
##
## data: step_aic_model
## BP = 22.885, df = 20, p-value = 0.2945
plot_fitted_versus_residuals(fitted(step_aic_model), resid(step_aic_model), "Step Search - AIC Model")
### Shapiro - Wilk Normality Test on Step Search - AIC Model ###
shapiro.test(resid(step_aic_model))
##
## Shapiro-Wilk normality test
##
## data: resid(step_aic_model)
## W = 0.99683, p-value = 0.5883
qqnorm(resid(step_aic_model), main = "Normal Q-Q Plot, Step Search - AIC Model", col = "darkgrey")
qqline(resid(step_aic_model), col = "dodgerblue", lwd = 2)
## [1] "Leave One Out Cross-Validated RMSE: 2.8"
### Do the number of standard residuals greater than 2 exceed 5% of the total observations -- Step Search: AIC Model
std_resid_step_aic_model <- rstandard(step_aic_model)[abs(rstandard(step_aic_model)) > 2]
is_std_resid_gt_five_percent__step_aic_model <- length(std_resid_step_aic_model) / n > 0.05
ifelse(is_std_resid_gt_five_percent__step_aic_model,"Exceeds 5% of Obs", "Does Not Exceed 5% of Obs")
## [1] "Does Not Exceed 5% of Obs"
#### Step Search - AIC Model: Unusual Observtions -- Cooks Distance > 4 / n
cd_step_aic_model <- cooks.distance(step_aic_model) > 4 / n
print(paste("Number of Influential Observations: ", sum(cd_step_aic_model)))
## [1] "Number of Influential Observations: 29"
bbproj_trn[which(cd_step_aic_model),]
## # A tibble: 29 x 51
## yearID franchID W DivWin WCWin LgWin WSWin R AB H X2B
## <int> <fct> <int> <fct> <fct> <fct> <fct> <int> <int> <int> <int>
## 1 2000 CHC 65 N N N N 764 5577 1426 272
## 2 2000 HOU 72 N N N N 938 5570 1547 289
## 3 2000 KCR 77 N N N N 879 5709 1644 281
## 4 2000 MIL 73 N N N N 740 5563 1366 297
## 5 2000 SFG 97 Y N N N 925 5519 1535 304
## 6 2000 TOR 83 N N N N 861 5677 1562 328
## 7 2001 COL 73 N N N N 923 5690 1663 324
## 8 2002 BOS 93 N N N N 859 5640 1560 348
## 9 2002 MIL 56 N N N N 627 5415 1369 269
## 10 2002 MIN 94 Y N N N 768 5582 1518 348
## # ... with 19 more rows, and 40 more variables: X3B <int>, HR <int>,
## # BB <int>, SO <int>, SB <int>, CS <int>, HBP <int>, SF <int>, RA <int>,
## # ER <int>, ERA <dbl>, CG <int>, SHO <int>, SV <int>, IPouts <int>,
## # HA <int>, HRA <int>, BBA <int>, SOA <int>, E <int>, DP <int>,
## # FP <dbl>, park <fct>, attendance <int>, BPF <int>, PPF <int>,
## # salary <int>, RBI <int>, GIDP <int>, IBB <int>, TB <int>, SLG <dbl>,
## # OBP <dbl>, OPS <dbl>, WHIP <dbl>, BABIP <dbl>, RC <dbl>, X1B <int>,
## # uBB <int>, wOBA <dbl>
### VIF > 5 for Step Search: AIC Model Coefficients
library(faraway)
vif_step_aic_model <- vif(step_aic_model)
vif_step_aic_model[which(vif_step_aic_model > 5)]
## RA R AB H E FP BPF
## 17.262491 5.084637 12.376016 17.104981 50.597130 51.635315 82.305380
## PPF HA
## 80.349149 7.123143
smaller_step_aic_model <- lm(formula = W ~ SV + R + SHO + CG + X3B + IPouts + BBA + HRA + SOA, data = bbproj_trn)
summary(smaller_step_aic_model)
##
## Call:
## lm(formula = W ~ SV + R + SHO + CG + X3B + IPouts + BBA + HRA +
## SOA, data = bbproj_trn)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.9548 -2.9275 0.2111 2.5960 15.3693
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -106.910065 24.782771 -4.314 0.000020123812728 ***
## SV 0.553970 0.033062 16.756 < 2e-16 ***
## R 0.082172 0.002520 32.612 < 2e-16 ***
## SHO 0.459052 0.068216 6.729 0.000000000057562 ***
## CG 0.295343 0.063584 4.645 0.000004589228066 ***
## X3B -0.063341 0.022615 -2.801 0.00534 **
## IPouts 0.026697 0.005825 4.583 0.000006087870449 ***
## BBA -0.027269 0.003450 -7.905 0.000000000000025 ***
## HRA -0.098460 0.009868 -9.978 < 2e-16 ***
## SOA 0.013936 0.001999 6.970 0.000000000012736 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.006 on 410 degrees of freedom
## Multiple R-squared: 0.8837, Adjusted R-squared: 0.8812
## F-statistic: 346.3 on 9 and 410 DF, p-value: < 2.2e-16
library(lmtest)
library(faraway)
vif(smaller_step_aic_model)
## SV R SHO CG X3B IPouts BBA HRA
## 1.468999 1.163807 1.753261 1.158178 1.034795 1.375007 1.345455 1.577315
## SOA
## 1.440348
print(paste("Number of coefficients with VIF > 5 : ",sum(vif(smaller_step_aic_model) > 5)))
## [1] "Number of coefficients with VIF > 5 : 0"
bptest(smaller_step_aic_model)
##
## studentized Breusch-Pagan test
##
## data: smaller_step_aic_model
## BP = 3.5705, df = 9, p-value = 0.9373
shapiro.test(resid(smaller_step_aic_model))
##
## Shapiro-Wilk normality test
##
## data: resid(smaller_step_aic_model)
## W = 0.99481, p-value = 0.1694
print(paste("Leave One Out Cross-Validated RMSE: ", round(calc_loocv_rmse(smaller_step_aic_model),2)))
## [1] "Leave One Out Cross-Validated RMSE: 4.05"
std_resid_step_aic_model <- rstandard(smaller_step_aic_model)[abs(rstandard(smaller_step_aic_model)) > 2]
is_std_resid_gt_five_percent_step_aic_model <- length(std_resid_step_aic_model) / n > 0.05
ifelse(is_std_resid_gt_five_percent_step_aic_model,"Outliers Exceed 5% of Obs", "Outliers Do Not Exceed 5% of Obs")
## [1] "Outliers Do Not Exceed 5% of Obs"
cd_smaller_step_aic_model <- cooks.distance(smaller_step_aic_model) > 4 / n
print(paste("Number of Influential Observations: ", sum(cd_smaller_step_aic_model)))
## [1] "Number of Influential Observations: 17"
** Richard: use same diagrams and approach I did. (Copy and paste my code and modify) **
** Richard: use same approach I did. (Copy and paste my code and modify)**
** Richard: use same approach I did. (Copy and paste my code and modify).**
** Richard: use same approach I did. (Copy and paste my code and modify.) **
** Richard: use same diagrams and approach I did. (Copy and paste my code and modify) **
** Richard: use same approach I did. (Copy and paste my code and modify)**
** Richard: use same approach I did. (Copy and paste my code and modify).**
** Richard: use same approach I did. (Copy and paste my code and modify.) **
** Richard: Copy and paste the 2 above code blocks for Plotting predicted vs actual wins for your smaller (best) Forward BIC Model. Modify as required. **
** Richard: Copy and paste the 2 above code blocks for Plotting predicted vs actual wins for your smaller (best) Forward AIC Model. Modify as required. **
Let’s turn our attention to logistic regression and our ability to classify and predict division winners using our team predictor set. First, let’s see how well salary alone can predict pennants.
salary_only_model <- glm(DivWin ~ salary, data = bbproj_trn, family = binomial)
(salary_only_summary = summary(salary_only_model))
##
## Call:
## glm(formula = DivWin ~ salary, family = binomial, data = bbproj_trn)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5538 -0.6746 -0.5664 -0.4673 2.1545
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.693550028350 0.317777033524 -8.476 < 2e-16 ***
## salary 0.000000015283 0.000000003249 4.704 0.00000255 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 420.34 on 419 degrees of freedom
## Residual deviance: 397.38 on 418 degrees of freedom
## AIC: 401.38
##
## Number of Fisher Scoring iterations: 4
plot(as.numeric(DivWin) - 1 ~ salary, data = bbproj_trn,
pch = 20,
main = "Probability of Buying a Division Winner",
ylab = "Probability of Winning Division",
xlab = "Team Salary ($)",
xlim = c(0, 3e8))
curve(predict(salary_only_model, data.frame(salary = x), type = "response"),
add = TRUE,
col = "tomato",
lty = 2,
lwd = 2)
Graphically, a team’s payroll seems to be moderately influential in predicting their chances of taking home a division crown. Indeed, our Wald test for salary alone yields a p-value of 0.0000025, allowing us to reject the null hypothesis (\(H_0 : \beta_{salary} = 0\)) for any reasonable value for \(\alpha\). So what happens when we look at our salary model’s misclassification rate?
salary_prediction <- ifelse(predict(salary_only_model, bbproj_tst, type = "response") > 0.5, "Y", "N")
(prevalence = table(bbproj_tst$DivWin) / nrow(bbproj_tst))
##
## N Y
## 0.8 0.2
(salary_misclass = mean(salary_prediction != bbproj_tst$DivWin))
## [1] 0.2222222
First, let’s examine prevalence of division winners. We see that 20% of the teams were division winners, which makes sense, since there are 6 divisions and 30 MLB teams, therefore you will only have 6 division winners per year. Our salary model has a misclassification rate of 0.2222222, which is worse than our prevalence. We would have a better misclassification rate if we simply stated that there are no division winners! We can certainly do better than this.
We will begin our search for a better classifier by setting up an initial logisitc regression model contain all of the predictors we would like to evaluate. Then, we will proceed to eliminate predictors using backwards, forwards, and stepwise AIC and BIC searches. See Appendix C for a complete evaluation of all of the models. The following will detail the methodology using our best candiate model.
scope <- DivWin ~ W + R + AB + H + X2B + X3B + HR + BB + SO + SB + CS + HBP + SF + RA + ER + ERA + CG + SHO + SV + IPouts + HA + HRA + BBA + SOA + E + DP + FP + attendance + BPF + PPF + salary + RBI + GIDP + IBB + TB + SLG + OBP + OPS + WHIP + BABIP + RC + X1B + uBB + wOBA
start_model <- glm(DivWin ~ 1, data = bbproj_trn, family = binomial)
n <- length(resid(start_model))
bic_model <- step(start_model, direction = "forward", scope = scope, k = log(n), trace = 0)
summary(bic_model)
##
## Call:
## glm(formula = DivWin ~ W + X2B, family = binomial, data = bbproj_trn)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.35987 -0.30139 -0.06081 -0.00867 2.59348
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -27.523331 3.832156 -7.182 0.000000000000686 ***
## W 0.350030 0.042623 8.212 < 2e-16 ***
## X2B -0.016344 0.006758 -2.418 0.0156 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 420.34 on 419 degrees of freedom
## Residual deviance: 194.55 on 417 degrees of freedom
## AIC: 200.55
##
## Number of Fisher Scoring iterations: 7
Our forward search using BIC yields the following logistic model
\[ \log \bigg(\frac{P[DivWin = 1]}{1 - P[DivWin = 1]} \bigg) = \beta_0 + \beta_W W + \beta_{X2B} X2B \]
where the log odds of a team winning their division is dependent upon wins and doubles.
Next, we will evaluate this model using a confusion matrix and calculating its sensitivity, specificity, and its misclassification rate, starting with a cutoff value of 0.5.
bic_prediction <- ifelse(predict(bic_model, bbproj_tst, type = "response") > 0.5, "Y", "N")
(bic_misclass = mean(bic_prediction != bbproj_tst$DivWin))
## [1] 0.1
(bic_conf_matrix = table(prediction = bic_prediction, actual = bbproj_tst$DivWin))
## actual
## prediction N Y
## N 70 7
## Y 2 11
(bic_sensitivity = bic_conf_matrix[2, 2] / sum(bic_conf_matrix[, 2]))
## [1] 0.6111111
(bic_specificity = bic_conf_matrix[1, 1] / sum(bic_conf_matrix[, 1]))
## [1] 0.9722222
Here, we beat the salary only model along with the prevalence with a misclassification rate of 0.1. Our specificity looks great for our forward BIC model at 0.972. Our 7 false negatives could come down, however, with an adjustment to our cutoff value.
The following function will loop through a vector of potential cutoffs to isolate one that will produce the smallest misclassification rate with a minimal differential between sensitivity and specificity:
opt_logistic_cutoff = function(model, cut_start = 0.01, cut_end = 0.99, plotit = TRUE) {
# Loop through potential cutoffs from cut_start to cut_end to determine a cutoff that
# produces the lowest misclassification rate with the smallest delta between sensitivity
# and specificity
cutoffs = seq(cut_start, cut_end, by = 0.01)
sens = rep(0, length(cutoffs))
spec = rep(0, length(cutoffs))
misclass = rep(0, length(cutoffs))
delta = rep(0, length(cutoffs))
for (i in 1:length(cutoffs)) {
pred = ifelse(predict(model, bbproj_tst, type = "response") > cutoffs[i], "Y", "N")
conf_mat = table(prediction = pred, actual = bbproj_tst$DivWin)
sens[i] = conf_mat[2, 2] / sum(conf_mat[, 2])
spec[i] = conf_mat[1, 1] / sum(conf_mat[, 1])
misclass[i] = mean(pred != bbproj_tst$DivWin)
delta[i] = abs(sens[i] - spec[i])
}
# Get the indicies of the smallest misclassification rates
min_misclass = which(misclass == min(misclass))
# Get the smallest delta between sensitivity and specificity at the identified
# misclassification indicies
min_delta = min_misclass[which.min(delta[min_misclass])]
# Plot sensitivity, specificity, and misclassification if requested
if (plotit) {
plot(sens ~ cutoffs,
xlab = "Cutoff",
ylab = "Sensitivity/Specificity",
main = "Sensitivity and Specificity at Varied Cutoffs",
col = "tomato",
type = "b",
ylim = c(0, 1),
pch = 20)
lines(spec ~ cutoffs,
col = "darkslategray4",
type = "b",
pch = 20)
lines(misclass ~ cutoffs,
col = "darkorange",
type = "b",
pch = 20)
abline(v = cutoffs[min_delta], lty = 2)
legend("topright", c("Sensitivity", "Specificity", "Misclassification"),
col = c("tomato", "darkslategray4", "darkorange"),
lwd = 1,
pch = 20)
}
# Return the misclassification, sensitivity, and specificity at the optimal
# cutoff value
c(cutoff = cutoffs[min_delta],
misclass = misclass[min_delta],
sensitivity = sens[min_delta],
specificity = spec[min_delta])
}
(opt_cutoff = opt_logistic_cutoff(bic_model, cut_start = 0.1, cut_end = 0.8))
## cutoff misclass sensitivity specificity
## 0.30000000 0.05555556 0.88888889 0.95833333
Our routine finds an optimal cutoff value for our forward BIC model of 0.3. Let’s proceed to evaluate the diagnostics manually:
bic_prediction_3 <- ifelse(predict(bic_model, bbproj_tst, type = "response") > 0.3, "Y", "N")
(bic_misclass_3 = mean(bic_prediction_3 != bbproj_tst$DivWin))
## [1] 0.05555556
(bic_conf_matrix_3 = table(prediction = bic_prediction_3, actual = bbproj_tst$DivWin))
## actual
## prediction N Y
## N 69 2
## Y 3 16
(bic_sensitivity_3 = bic_conf_matrix_3[2, 2] / sum(bic_conf_matrix_3[, 2]))
## [1] 0.8888889
(bic_specificity_3 = bic_conf_matrix_3[1, 1] / sum(bic_conf_matrix_3[, 1]))
## [1] 0.9583333
Our misclassification rate went down with our cutoff adjustment to 0.056. Additionally, our sensitivity went up significantly (0.889) and we only had a modest reduction in specificity (0.958). This model looks to be exceptionally adept at classifying division winners, and salary is not even a predictor!
scope <- DivWin ~ W + R + AB + H + X2B + X3B + HR + BB + SO + SB + CS + HBP + SF + RA + ER + ERA + CG + SHO + SV + IPouts + HA + HRA + BBA + SOA + E + DP + FP + attendance + BPF + PPF + salary + RBI + GIDP + IBB + TB + SLG + OBP + OPS + WHIP + BABIP + RC + X1B + uBB + wOBA
formula <- formula(DivWin ~ 1)
start_model <- glm(formula, data = bbproj_trn, family = binomial)
n <- length(resid(start_model))
formula <- formula(scope)
back_bic_model <- step(glm(formula, data = bbproj_trn, family = binomial),
direction = "backward", k = log(n), trace = 0)
summary(back_bic_model)
##
## Call:
## glm(formula = DivWin ~ W + AB + H + HR + BB + HBP + IBB + SLG +
## OBP + wOBA, family = binomial, data = bbproj_trn)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.09816 -0.27642 -0.04275 -0.00332 2.76522
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1052.50217 319.70312 -3.292 0.000994 ***
## W 0.37031 0.04793 7.726 0.0000000000000111 ***
## AB 0.17509 0.05585 3.135 0.001717 **
## H -0.52892 0.16383 -3.228 0.001245 **
## HR 0.09433 0.03603 2.618 0.008842 **
## BB -0.34709 0.10338 -3.357 0.000787 ***
## HBP -0.33911 0.09686 -3.501 0.000463 ***
## IBB -0.30746 0.12140 -2.533 0.011320 *
## SLG 1794.92000 702.65755 2.554 0.010635 *
## OBP 6178.22128 2030.78091 3.042 0.002348 **
## wOBA -5414.02198 2077.07626 -2.607 0.009146 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 420.34 on 419 degrees of freedom
## Residual deviance: 179.06 on 409 degrees of freedom
## AIC: 201.06
##
## Number of Fisher Scoring iterations: 8
back_bic_prediction <- ifelse(predict(back_bic_model, bbproj_tst, type = "response") > 0.5, "Y", "N")
(back_bic_misclass = mean(back_bic_prediction != bbproj_tst$DivWin))
## [1] 0.1
(back_bic_conf_matrix = table(prediction = back_bic_prediction, actual = bbproj_tst$DivWin))
## actual
## prediction N Y
## N 70 7
## Y 2 11
(back_bic_sensitivity = back_bic_conf_matrix[2, 2] / sum(back_bic_conf_matrix[, 2]))
## [1] 0.6111111
(back_bic_specificity = back_bic_conf_matrix[1, 1] / sum(back_bic_conf_matrix[, 1]))
## [1] 0.9722222
opt_logistic_cutoff(back_bic_model, cut_start = 0.1, cut_end = 0.8)
## cutoff misclass sensitivity specificity
## 0.26000000 0.05555556 0.83333333 0.97222222
forw_bic_model <- step(start_model, direction = "forward", scope = scope, k = log(n), trace = 0)
summary(forw_bic_model)
##
## Call:
## glm(formula = DivWin ~ W + X2B, family = binomial, data = bbproj_trn)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.35987 -0.30139 -0.06081 -0.00867 2.59348
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -27.523331 3.832156 -7.182 0.000000000000686 ***
## W 0.350030 0.042623 8.212 < 2e-16 ***
## X2B -0.016344 0.006758 -2.418 0.0156 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 420.34 on 419 degrees of freedom
## Residual deviance: 194.55 on 417 degrees of freedom
## AIC: 200.55
##
## Number of Fisher Scoring iterations: 7
forw_bic_prediction <- ifelse(predict(forw_bic_model, bbproj_tst, type = "response") > 0.5, "Y", "N")
(forw_bic_misclass = mean(forw_bic_prediction != bbproj_tst$DivWin))
## [1] 0.1
(forw_bic_conf_matrix = table(prediction = forw_bic_prediction, actual = bbproj_tst$DivWin))
## actual
## prediction N Y
## N 70 7
## Y 2 11
(forw_bic_sensitivity = forw_bic_conf_matrix[2, 2] / sum(forw_bic_conf_matrix[, 2]))
## [1] 0.6111111
(forw_bic_specificity = forw_bic_conf_matrix[1, 1] / sum(forw_bic_conf_matrix[, 1]))
## [1] 0.9722222
opt_logistic_cutoff(forw_bic_model, cut_start = 0.1, cut_end = 0.8)
## cutoff misclass sensitivity specificity
## 0.30000000 0.05555556 0.88888889 0.95833333
step_bic_model <- step(start_model, direction = "both", scope = scope, k = log(n), trace = 0)
summary(step_bic_model)
##
## Call:
## glm(formula = DivWin ~ W + X2B, family = binomial, data = bbproj_trn)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.35987 -0.30139 -0.06081 -0.00867 2.59348
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -27.523331 3.832156 -7.182 0.000000000000686 ***
## W 0.350030 0.042623 8.212 < 2e-16 ***
## X2B -0.016344 0.006758 -2.418 0.0156 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 420.34 on 419 degrees of freedom
## Residual deviance: 194.55 on 417 degrees of freedom
## AIC: 200.55
##
## Number of Fisher Scoring iterations: 7
step_bic_prediction <- ifelse(predict(step_bic_model, bbproj_tst, type = "response") > 0.5, "Y", "N")
(step_bic_misclass = mean(step_bic_prediction != bbproj_tst$DivWin))
## [1] 0.1
(step_bic_conf_matrix = table(prediction = step_bic_prediction, actual = bbproj_tst$DivWin))
## actual
## prediction N Y
## N 70 7
## Y 2 11
(step_bic_sensitivity = step_bic_conf_matrix[2, 2] / sum(step_bic_conf_matrix[, 2]))
## [1] 0.6111111
(step_bic_specificity = step_bic_conf_matrix[1, 1] / sum(step_bic_conf_matrix[, 1]))
## [1] 0.9722222
opt_logistic_cutoff(step_bic_model, cut_start = 0.1, cut_end = 0.8)
## cutoff misclass sensitivity specificity
## 0.30000000 0.05555556 0.88888889 0.95833333
formula <- formula(scope)
back_aic_model <- step(glm(formula, data = bbproj_trn, family = binomial),
direction = "backward", trace = 0)
summary(back_aic_model)
##
## Call:
## glm(formula = DivWin ~ W + R + AB + H + HR + BB + CS + HBP +
## ER + IPouts + E + FP + BPF + PPF + RBI + IBB + SLG + OBP +
## OPS + wOBA, family = binomial, data = bbproj_trn)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.96822 -0.20843 -0.02111 -0.00065 2.65771
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -140.179255 693.655494 -0.202 0.839848
## W 0.545656 0.090798 6.010 0.00000000186 ***
## R -0.076857 0.035782 -2.148 0.031717 *
## AB 0.183520 0.063041 2.911 0.003601 **
## H -0.592491 0.183941 -3.221 0.001277 **
## HR 0.079762 0.041066 1.942 0.052099 .
## BB -0.401042 0.116424 -3.445 0.000572 ***
## CS -0.059004 0.023336 -2.528 0.011457 *
## HBP -0.393193 0.110648 -3.554 0.000380 ***
## ER 0.024676 0.008386 2.943 0.003255 **
## IPouts 0.022175 0.011255 1.970 0.048806 *
## E -0.168360 0.092043 -1.829 0.067378 .
## FP -1111.221700 583.517366 -1.904 0.056865 .
## BPF 0.736250 0.339446 2.169 0.030084 *
## PPF -0.674039 0.343047 -1.965 0.049431 *
## RBI 0.070973 0.036748 1.931 0.053438 .
## IBB -0.299451 0.134303 -2.230 0.025769 *
## SLG 851959707.701375 417793051.690715 2.039 0.041431 *
## OBP 851964540.159953 417793092.777287 2.039 0.041430 *
## OPS -851957944.218280 417793035.107366 -2.039 0.041431 *
## wOBA -5332.125383 2319.746099 -2.299 0.021529 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 420.34 on 419 degrees of freedom
## Residual deviance: 151.22 on 399 degrees of freedom
## AIC: 193.22
##
## Number of Fisher Scoring iterations: 8
back_aic_prediction <- ifelse(predict(back_aic_model, bbproj_tst, type = "response") > 0.5, "Y", "N")
(back_aic_misclass = mean(back_aic_prediction != bbproj_tst$DivWin))
## [1] 0.07777778
(back_aic_conf_matrix = table(prediction = back_aic_prediction, actual = bbproj_tst$DivWin))
## actual
## prediction N Y
## N 70 5
## Y 2 13
(back_aic_sensitivity = back_aic_conf_matrix[2, 2] / sum(back_aic_conf_matrix[, 2]))
## [1] 0.7222222
(back_aic_specificity = back_aic_conf_matrix[1, 1] / sum(back_aic_conf_matrix[, 1]))
## [1] 0.9722222
opt_logistic_cutoff(back_aic_model, cut_start = 0.1, cut_end = 0.8)
## cutoff misclass sensitivity specificity
## 0.26000000 0.05555556 0.88888889 0.95833333
back_aic_prediction <- ifelse(predict(back_aic_model, bbproj_tst, type = "response") > 0.3, "Y", "N")
(back_aic_misclass = mean(back_aic_prediction != bbproj_tst$DivWin))
## [1] 0.05555556
(back_aic_conf_matrix = table(prediction = back_aic_prediction, actual = bbproj_tst$DivWin))
## actual
## prediction N Y
## N 69 2
## Y 3 16
(back_aic_sensitivity = back_aic_conf_matrix[2, 2] / sum(back_aic_conf_matrix[, 2]))
## [1] 0.8888889
(back_aic_specificity = back_aic_conf_matrix[1, 1] / sum(back_aic_conf_matrix[, 1]))
## [1] 0.9583333
forw_aic_model <- step(start_model, direction = "forward", scope = scope, trace = 0)
summary(forw_aic_model)
##
## Call:
## glm(formula = DivWin ~ W + X2B + HA + DP + GIDP, family = binomial,
## data = bbproj_trn)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.31485 -0.25596 -0.04893 -0.00565 2.90172
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -37.894889 6.186389 -6.126 9.04e-10 ***
## W 0.392678 0.048963 8.020 1.06e-15 ***
## X2B -0.024866 0.007752 -3.208 0.00134 **
## HA 0.006988 0.002855 2.447 0.01440 *
## DP -0.024295 0.012063 -2.014 0.04401 *
## GIDP 0.021012 0.012242 1.716 0.08609 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 420.34 on 419 degrees of freedom
## Residual deviance: 182.92 on 414 degrees of freedom
## AIC: 194.92
##
## Number of Fisher Scoring iterations: 7
forw_aic_prediction <- ifelse(predict(forw_aic_model, bbproj_tst, type = "response") > 0.5, "Y", "N")
(forw_aic_misclass = mean(forw_aic_prediction != bbproj_tst$DivWin))
## [1] 0.1111111
(forw_aic_conf_matrix = table(prediction = forw_aic_prediction, actual = bbproj_tst$DivWin))
## actual
## prediction N Y
## N 70 8
## Y 2 10
(forw_aic_sensitivity = forw_aic_conf_matrix[2, 2] / sum(forw_aic_conf_matrix[, 2]))
## [1] 0.5555556
(forw_aic_specificity = forw_aic_conf_matrix[1, 1] / sum(forw_aic_conf_matrix[, 1]))
## [1] 0.9722222
opt_logistic_cutoff(forw_aic_model, cut_start = 0.1, cut_end = 0.8)
## cutoff misclass sensitivity specificity
## 0.23000000 0.07777778 0.94444444 0.91666667
step_aic_model <- step(start_model, direction = "both", scope = scope, trace = 0)
summary(step_aic_model)
##
## Call:
## glm(formula = DivWin ~ W + X2B + HA + DP + GIDP, family = binomial,
## data = bbproj_trn)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.31485 -0.25596 -0.04893 -0.00565 2.90172
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -37.894889 6.186389 -6.126 9.04e-10 ***
## W 0.392678 0.048963 8.020 1.06e-15 ***
## X2B -0.024866 0.007752 -3.208 0.00134 **
## HA 0.006988 0.002855 2.447 0.01440 *
## DP -0.024295 0.012063 -2.014 0.04401 *
## GIDP 0.021012 0.012242 1.716 0.08609 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 420.34 on 419 degrees of freedom
## Residual deviance: 182.92 on 414 degrees of freedom
## AIC: 194.92
##
## Number of Fisher Scoring iterations: 7
step_aic_prediction <- ifelse(predict(step_aic_model, bbproj_tst, type = "response") > 0.5, "Y", "N")
(step_aic_misclass = mean(step_aic_prediction != bbproj_tst$DivWin))
## [1] 0.1111111
(step_aic_conf_matrix = table(prediction = step_aic_prediction, actual = bbproj_tst$DivWin))
## actual
## prediction N Y
## N 70 8
## Y 2 10
(step_aic_sensitivity = step_aic_conf_matrix[2, 2] / sum(step_aic_conf_matrix[, 2]))
## [1] 0.5555556
(step_aic_specificity = step_aic_conf_matrix[1, 1] / sum(step_aic_conf_matrix[, 1]))
## [1] 0.9722222
opt_logistic_cutoff(step_aic_model, cut_start = 0.1, cut_end = 0.8)
## cutoff misclass sensitivity specificity
## 0.23000000 0.07777778 0.94444444 0.91666667